library(data.table)
library(R.utils)
library(stringr)
library(ggcorrplot)
theme_set(theme_bw())

Processing data

fnm1 = "GSE120575_Sade_Feldman_melanoma_single_cells_nolabel_1.txt.gz"
fnm2 = "GSE120575_Sade_Feldman_melanoma_single_cells_2.txt.gz"

sf_tpm1 = fread(paste0("../../scRNAseq/Sade_Feldman_2018/", fnm1), 
                fill = TRUE, header = TRUE)

sf_tpm2 = fread(paste0("../../scRNAseq/Sade_Feldman_2018/", fnm2), 
                fill = TRUE, drop = 16293, col.names = colnames(sf_tpm1))

ls_tpm = list(sf_tpm1,sf_tpm2)
rm(sf_tpm1)
rm(sf_tpm2)

sf_tpm = rbindlist(ls_tpm)

rm(ls_tpm)
dim(sf_tpm) # 55737 x 16292
## [1] 55737 16292
sf_tpm[1:2,1:4]
gene.names = sf_tpm$V1
rownames(sf_tpm) = gene.names
sf_tpm$V1 = NULL

Only select classified cells.

cell_type = read.table("./output/cell_type.txt", header = TRUE, 
                       sep = "\t", as.is = TRUE)
dim(cell_type)
## [1] 16291     4
cell_type[1:2,]
table(cell_type$type, useNA="ifany")
## 
##               B_cells           CD4T_memory                CD8T_B 
##                  1427                  1099                  2322 
##                CD8T_G       Dendritic_cells Monocytes_Macrophages 
##                  3036                   281                  1330 
##                   NK1                   NK2                   NK3 
##                   456                  1961                  1096 
##          Plasma_cells                 Tregs           unclustered 
##                   284                  1252                  1747
clustered_cells = cell_type$cell[cell_type$type != "unclustered"]
cluster_sf_tpm  = sf_tpm[, ..clustered_cells]
rownames(cluster_sf_tpm) = gene.names

dim(cluster_sf_tpm)
## [1] 55737 14544
cluster_sf_tpm[1:3,1:3]
rm(sf_tpm)
rm(clustered_cells)

Select ~10,000 genes expressed in at least a proportion of cells

Calculate the proportion of cells that each gene is expressed in

cellprop.per.gene = rowMeans(cluster_sf_tpm > 0)
table(cellprop.per.gene > 0.02)
## 
## FALSE  TRUE 
## 41139 14598
he.genes = gene.names[cellprop.per.gene > 0.02]
HEgenes_cluster_tpm = cluster_sf_tpm[cellprop.per.gene > 0.02, ]
rownames(HEgenes_cluster_tpm) = he.genes
dim(HEgenes_cluster_tpm)
## [1] 14598 14544
rm(cluster_sf_tpm)
rm(cellprop.per.gene)

Read in the two versions of signature genes

sig_gene = fread("./output/SF2018_sig_gene_matrix.txt")

dim(sig_gene)
## [1] 594  12
sig_gene[1:2,]

Check the correlations between cell types using this signature matrix.

sig_mtrx = data.matrix(sig_gene[, -1])
dim(sig_mtrx)
## [1] 594  11
sig_mtrx[1:2,]
##         B_cells CD4T_memory    CD8T_B    CD8T_G Dendritic_cells
## [1,] 0.04348984   0.1219290 0.7313351 0.3241238     0.008256228
## [2,] 0.02385424   0.2537671 2.1449182 0.4139888     0.079608541
##      Monocytes_Macrophages       NK1       NK2       NK3 Plasma_cells     Tregs
## [1,]            0.05795489 0.6226316 0.2564559 0.7853011   0.08524648 0.4924681
## [2,]            0.35542857 2.4847149 0.5287404 1.8980383   0.59714789 3.4090974
cr1 = cor(sig_mtrx)
ggcorrplot(cr1, tl.cex = 6)

round(cr1[c(2:4,7:9),c(2:4,7:9)], 2)
##             CD4T_memory CD8T_B CD8T_G  NK1  NK2  NK3
## CD4T_memory        1.00   0.23   0.55 0.18 0.66 0.40
## CD8T_B             0.23   1.00   0.82 0.57 0.67 0.89
## CD8T_G             0.55   0.82   1.00 0.31 0.84 0.79
## NK1                0.18   0.57   0.31 1.00 0.35 0.50
## NK2                0.66   0.67   0.84 0.35 1.00 0.84
## NK3                0.40   0.89   0.79 0.50 0.84 1.00

Read in the details of DE analysis

sig_info = fread("./output/SF2018_sig_gene_details.txt")

dim(sig_info)
## [1] 14598    56
sig_info[1:2,c(1:4,(ncol(sig_info)-3):ncol(sig_info)), with=F]

illustrate the results for a few fenes

cluster_ct = cell_type$type[match(names(HEgenes_cluster_tpm), cell_type$cell)]
marker_col = grep("^marker_", names(sig_info), value = TRUE)
u_col  = grep("^u_", names(sig_info), value = TRUE)
fc_col = grep("^fc_", names(sig_info), value = TRUE)

plot1 <- function(gene1, edata, cluster_ct, sig_info){
  y      = as.numeric(edata[which(rownames(edata) == gene1),])
  df1    = data.frame(expression = y, cell_type = cluster_ct)
  
  g1 = ggplot(df1, aes(x=cell_type, y=expression, fill=cell_type)) +
    geom_boxplot() + coord_flip() + theme(legend.position = "none") +
    ggtitle(gene1)
  print(g1)
  
  ct = marker_col[unlist(sig_info[gene==gene1, marker_col, with=FALSE])==1]
  print(ct)
  
  sig_info[gene==gene1, c("gene", u_col), with=F]
  sig_info[gene==gene1, c("gene", fc_col), with=F]
}

plot1("FOXP3", HEgenes_cluster_tpm, cluster_ct, sig_info)

## [1] "marker_Tregs"
plot1("GZMA",  HEgenes_cluster_tpm, cluster_ct, sig_info)

## character(0)
plot1("PDCD1", HEgenes_cluster_tpm, cluster_ct, sig_info)

## [1] "marker_CD8T_B"
plot1("CTLA4", HEgenes_cluster_tpm, cluster_ct, sig_info)

## [1] "marker_Tregs"
plot1("LAG3",  HEgenes_cluster_tpm, cluster_ct, sig_info)

## [1] "marker_CD8T_B"
plot1("CD3E",  HEgenes_cluster_tpm, cluster_ct, sig_info)

## character(0)
plot1("CD4",   HEgenes_cluster_tpm, cluster_ct, sig_info)

## [1] "marker_Tregs"
plot1("CD8A",  HEgenes_cluster_tpm, cluster_ct, sig_info)

## [1] "marker_CD8T_B" "marker_CD8T_G"
plot1("CD33",  HEgenes_cluster_tpm, cluster_ct, sig_info)

## [1] "marker_Monocytes_Macrophages"
plot1("CD14",  HEgenes_cluster_tpm, cluster_ct, sig_info)

## [1] "marker_Monocytes_Macrophages"
plot1("CD19",  HEgenes_cluster_tpm, cluster_ct, sig_info)

## [1] "marker_B_cells"
plot1("LMNA",  HEgenes_cluster_tpm, cluster_ct, sig_info)

## character(0)
plot1("FASLG", HEgenes_cluster_tpm, cluster_ct, sig_info)

## [1] "marker_CD8T_B" "marker_NK3"
plot1("VCAM1", HEgenes_cluster_tpm, cluster_ct, sig_info)

## [1] "marker_CD8T_B" "marker_NK3"
plot1("LEF1",  HEgenes_cluster_tpm, cluster_ct, sig_info)

## [1] "marker_CD4T_memory"
sig_info$gene[sig_info$marker_NK1 == 1]
##  [1] "ANLN"          "ASF1B"         "ASPM"          "AURKB"        
##  [5] "BIRC5"         "BUB1"          "BUB1B"         "CASC5"        
##  [9] "CCNA2"         "CCNB2"         "CDC20"         "CDC45"        
## [13] "CDCA2"         "CDCA3"         "CDCA5"         "CDCA8"        
## [17] "CDK1"          "CDKN3"         "CIT"           "CKAP2L"       
## [21] "DEPDC1B"       "DHFRP1"        "DLGAP5"        "DTL"          
## [25] "E2F2"          "ESCO2"         "GTSE1"         "HIST1H3B"     
## [29] "HIST1H3G"      "HJURP"         "KIAA0101"      "KIF11"        
## [33] "KIF15"         "KIF18B"        "KIF23"         "KIF2C"        
## [37] "KIFC1"         "MELK"          "MKI67"         "MLF1IP"       
## [41] "NCAPG"         "NUF2"          "PBK"           "PKMYT1"       
## [45] "PLK1"          "POLQ"          "RAD51"         "RP11-424C20.2"
## [49] "RRM2"          "SKA1"          "SPC24"         "SPC25"        
## [53] "TK1"           "TOP2A"         "TPX2"          "TYMS"         
## [57] "UBE2C"         "UHRF1"         "ZWINT"
plot1("CCNA2", HEgenes_cluster_tpm, cluster_ct, sig_info)

## [1] "marker_CD8T_B" "marker_NK1"
plot1("CCNB2", HEgenes_cluster_tpm, cluster_ct, sig_info)

## [1] "marker_NK1"
plot1("KIF11", HEgenes_cluster_tpm, cluster_ct, sig_info)

## [1] "marker_NK1"
sig_info$gene[sig_info$marker_NK2 == 1]
##  [1] "ADRB2"         "AE000661.37"   "B3GNT7"        "BIN2P1"       
##  [5] "C1orf21"       "CA5B"          "CDC34"         "COLQ"         
##  [9] "CX3CR1"        "EPHA4"         "FGFBP2"        "GNLY"         
## [13] "GOLGA8I"       "IL18RAP"       "KIR2DL3"       "KIR2DS4"      
## [17] "KIR3DL1"       "KIR3DL2"       "KLRB1"         "KLRC1"        
## [21] "KLRF1"         "LGR6"          "LINC00299"     "MATK"         
## [25] "MMP25"         "MPP7"          "MYBL1"         "NCAM1"        
## [29] "NCR1"          "NMUR1"         "OSBPL5"        "P2RY8"        
## [33] "PDGFD"         "PDZD8"         "PITPNC1"       "PLEKHG3"      
## [37] "PRKX"          "PRKY"          "PRSS23"        "PTGDR"        
## [41] "PZP"           "RAP1GAP2"      "RP11-473M20.7" "RP11-53I6.2"  
## [45] "S1PR5"         "SEMA4C"        "SH2D1B"        "SOCS2"        
## [49] "SPON2"         "TGFBR3"        "TMIGD2"        "TNFSF14"      
## [53] "TRDC"          "TRGV7"         "TRGV9"         "TSPAN32"      
## [57] "TXK"           "YES1"          "ZBTB16"
plot1("FGFBP2",  HEgenes_cluster_tpm, cluster_ct, sig_info)

## [1] "marker_NK2"
plot1("KIR2DL3", HEgenes_cluster_tpm, cluster_ct, sig_info)

## [1] "marker_NK2"
plot1("KLRC1", HEgenes_cluster_tpm, cluster_ct, sig_info)

## [1] "marker_NK2"
plot1("NCR1",  HEgenes_cluster_tpm, cluster_ct, sig_info)

## [1] "marker_NK2" "marker_NK3"
plot1("GNLY",  HEgenes_cluster_tpm, cluster_ct, sig_info)

## [1] "marker_NK2"
plot1("TRDC",  HEgenes_cluster_tpm, cluster_ct, sig_info)

## [1] "marker_NK2" "marker_NK3"
plot1("TRGV7", HEgenes_cluster_tpm, cluster_ct, sig_info)

## [1] "marker_NK2"
plot1("TRGV9", HEgenes_cluster_tpm, cluster_ct, sig_info)

## [1] "marker_NK2" "marker_NK3"
sig_info$gene[sig_info$marker_NK3 == 1]
##  [1] "AC006460.2"     "AC069363.1"     "AC092580.4"     "ACTBP12"       
##  [5] "AE000661.37"    "AFAP1L2"        "B3GAT1"         "CADM1"         
##  [9] "CAV1"           "CD200R1"        "CHI3L2"         "CPNE7"         
## [13] "CRTAM"          "CTD-2313F11.1"  "CXCR6"          "DAPK2"         
## [17] "DBN1"           "DTHD1"          "DTX3"           "ENC1"          
## [21] "F2R"            "FAM179A"        "FASLG"          "FXYD2"         
## [25] "GNG4"           "GZMK"           "HAVCR1"         "HAVCR2"        
## [29] "IKZF2"          "JAKMIP1"        "KIAA1671"       "KIR2DL4"       
## [33] "KIR3DL2"        "LINC00484"      "LINC00539"      "NCR1"          
## [37] "NDFIP2"         "RDH10"          "RP11-222K16.2"  "RP11-277P12.20"
## [41] "RP11-305L7.1"   "RP11-4F5.2"     "RTKN2"          "SLC27A2"       
## [45] "TIMD4"          "TMEM155"        "TRDC"           "TRDV1"         
## [49] "TRGC1"          "TRGC2"          "TRGV2"          "TRGV4"         
## [53] "TRGV5"          "TRGV8"          "TRGV9"          "TSC22D1"       
## [57] "TTC24"          "VCAM1"          "ZNF80"
plot1("GZMK", HEgenes_cluster_tpm, cluster_ct, sig_info)

## [1] "marker_NK3"
plot1("HAVCR1",   HEgenes_cluster_tpm, cluster_ct, sig_info)

## [1] "marker_NK3"
plot1("KIR2DL4",  HEgenes_cluster_tpm, cluster_ct, sig_info)

## [1] "marker_CD8T_B" "marker_NK3"
plot1("TRGC1",  HEgenes_cluster_tpm, cluster_ct, sig_info)

## [1] "marker_NK3"
plot1("TRGC2",  HEgenes_cluster_tpm, cluster_ct, sig_info)

## [1] "marker_NK3"
plot1("TRGV2",  HEgenes_cluster_tpm, cluster_ct, sig_info)

## [1] "marker_NK3"
plot1("TRGV4",  HEgenes_cluster_tpm, cluster_ct, sig_info)

## [1] "marker_NK3"
plot1("TRGV5",  HEgenes_cluster_tpm, cluster_ct, sig_info)

## [1] "marker_NK3"
plot1("TRGV8",  HEgenes_cluster_tpm, cluster_ct, sig_info)

## [1] "marker_NK3"
gc()
##             used   (Mb) gc trigger   (Mb) limit (Mb)   max used    (Mb)
## Ncells   1095531   58.6    3236734  172.9         NA    3236734   172.9
## Vcells 305852484 2333.5 1090273317 8318.2      32768 2129439458 16246.4
sessionInfo()
## R version 3.6.2 (2019-12-12)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS Catalina 10.15.7
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.6/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
## [1] ggcorrplot_0.1.3  ggplot2_3.3.1     stringr_1.4.0     R.utils_2.9.2    
## [5] R.oo_1.23.0       R.methodsS3_1.8.0 data.table_1.12.8
## 
## loaded via a namespace (and not attached):
##  [1] Rcpp_1.0.3       plyr_1.8.5       compiler_3.6.2   pillar_1.4.3    
##  [5] tools_3.6.2      digest_0.6.23    jsonlite_1.6.1   evaluate_0.14   
##  [9] lifecycle_0.2.0  tibble_3.0.1     gtable_0.3.0     pkgconfig_2.0.3 
## [13] rlang_0.4.6      yaml_2.2.1       xfun_0.12        withr_2.1.2     
## [17] dplyr_0.8.4      knitr_1.28       vctrs_0.3.0      grid_3.6.2      
## [21] tidyselect_1.0.0 glue_1.3.1       R6_2.4.1         rmarkdown_2.1   
## [25] farver_2.0.3     reshape2_1.4.3   purrr_0.3.3      magrittr_1.5    
## [29] scales_1.1.0     htmltools_0.4.0  ellipsis_0.3.0   assertthat_0.2.1
## [33] colorspace_1.4-1 labeling_0.3     stringi_1.4.5    munsell_0.5.0   
## [37] crayon_1.3.4